import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Load the processed product CSV; the path is relative to the current working directory.
df_read = pd.read_csv('Desktop/amz_uk_processed_data.csv')
# Preview the first three rows to sanity-check the columns.
df_read.head(3)
| | asin | title | imgUrl | productURL | stars | reviews | price | isBestSeller | boughtInLastMonth | categoryName |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | B09B96TG33 | Echo Dot (5th generation, 2022 release) \| Big ... | https://m.media-amazon.com/images/I/71C3lbbeLs... | https://www.amazon.co.uk/dp/B09B96TG33 | 4.7 | 15308 | 21.99 | False | 0 | Hi-Fi Speakers |
| 1 | B01HTH3C8S | Anker Soundcore mini, Super-Portable Bluetooth... | https://m.media-amazon.com/images/I/61c5rSxwP0... | https://www.amazon.co.uk/dp/B01HTH3C8S | 4.7 | 98099 | 23.99 | True | 0 | Hi-Fi Speakers |
| 2 | B09B8YWXDF | Echo Dot (5th generation, 2022 release) \| Big ... | https://m.media-amazon.com/images/I/61j3SEUjMJ... | https://www.amazon.co.uk/dp/B09B8YWXDF | 4.7 | 15308 | 21.99 | False | 0 | Hi-Fi Speakers |
# Print the column dtypes, row count and memory usage of the raw frame.
df_read.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2222742 entries, 0 to 2222741 Data columns (total 10 columns): # Column Dtype --- ------ ----- 0 asin object 1 title object 2 imgUrl object 3 productURL object 4 stars float64 5 reviews int64 6 price float64 7 isBestSeller bool 8 boughtInLastMonth int64 9 categoryName object dtypes: bool(1), float64(2), int64(2), object(5) memory usage: 154.7+ MB
# Work on a copy so the frame as loaded (df_read) stays untouched.
df = df_read.copy()
# Count missing values per column.
df.isnull().sum()
asin 0 title 0 imgUrl 0 productURL 0 stars 0 reviews 0 price 0 isBestSeller 0 boughtInLastMonth 0 categoryName 0 dtype: int64
# Inspect the available column names.
df.columns
# Remove the ASIN, image URL and product URL columns from the working frame.
columns_to_drop = ['asin', 'imgUrl', 'productURL']
df = df.drop(columns=columns_to_drop)
# Cast the boolean flag to int (False -> 0, True -> 1).
df['isBestSeller'] = df['isBestSeller'].astype(int)
# Preview the reduced frame.
df.head(3)
| | title | stars | reviews | price | isBestSeller | boughtInLastMonth | categoryName |
|---|---|---|---|---|---|---|---|
| 0 | Echo Dot (5th generation, 2022 release) \| Big ... | 4.7 | 15308 | 21.99 | 0 | 0 | Hi-Fi Speakers |
| 1 | Anker Soundcore mini, Super-Portable Bluetooth... | 4.7 | 98099 | 23.99 | 1 | 0 | Hi-Fi Speakers |
| 2 | Echo Dot (5th generation, 2022 release) \| Big ... | 4.7 | 15308 | 21.99 | 0 | 0 | Hi-Fi Speakers |
# List the distinct values that occur in 'boughtInLastMonth'.
df['boughtInLastMonth'].unique()
array([ 0, 600, 1000, 200, 100, 50, 2000, 800, 900,
700, 500, 400, 300, 4000, 3000, 5000, 9000, 7000,
6000, 50000, 20000, 10000, 8000, 30000, 40000])
# Count how many distinct category names are present.
df['categoryName'].nunique()
296
# Count the occurrences of each value in the 'isBestSeller' column
# (0 / 1 once the column has been cast to int).
is_bestseller_count = df['isBestSeller'].value_counts()
# Create a DataFrame for the pie chart
is_bestseller_data = pd.DataFrame({'isBestSeller': is_bestseller_count.index, 'Count': is_bestseller_count.values})
# Define custom labels
custom_labels = {0: "Not Bestseller", 1: "Bestseller"}
# Attach the readable label to each row. The astype(int) makes the lookup work
# whether the flag is still boolean or already 0/1. The original 'isBestSeller'
# column is kept unchanged for any later use.
is_bestseller_data['Label'] = is_bestseller_data['isBestSeller'].astype(int).map(custom_labels)
# Create an interactive pie chart whose slices are named by the custom labels
fig = px.pie(is_bestseller_data, names='Label', values='Count', title='isBestSeller Count')
# Show the interactive pie chart
fig.show()
# Count the occurrences in the 'boughtInLastMonth' column
boughtInLastMonth_count = df['boughtInLastMonth'].value_counts()
# Combine values after the first 5 into 'Others' because they are too few to show.
# value_counts() sorts by frequency, so the first five rows are the most common
# values. .iloc makes the slice explicitly positional, since the index itself
# holds integers.
top_5_counts = boughtInLastMonth_count.iloc[:5]
other_counts = boughtInLastMonth_count.iloc[5:]
# Sum the counts of the 'Others'
other_total = other_counts.sum()
# Create a DataFrame for the five most common values; the values are turned into
# strings so the label column has one type once the 'Others' row is appended.
top_5_data = pd.DataFrame({
    'boughtInLastMonth': top_5_counts.index.astype(str),
    'Count': top_5_counts.values,
})
# Create a DataFrame for the 'Others' category
others_data = pd.DataFrame({'boughtInLastMonth': ['Others'], 'Count': [other_total]})
# Stack both parts into the frame the chart is drawn from
boughtInLastMonth_data = pd.concat([top_5_data, others_data], ignore_index=True)
# Create an interactive pie chart
fig = px.pie(boughtInLastMonth_data, names='boughtInLastMonth', values='Count', title='boughtInLastMonth Count')
# Show the interactive pie chart
fig.show()